from  pdf_convert_png import *;
from png_convert_word import*;
from word_convert_XML import *;
from png_convert_word import word_list;
from setting import*;
import easyocr
 
 
 
def main():
    """Run the PDF -> PNG -> word list -> XML pipeline.

    Steps, in order:

    1. Collect file names via ``reload_fileNames(pdf_path, 'pdf')`` and call
       ``pdf_convert_png`` on each, passing its zero-based position.
    2. Collect file names via ``reload_fileNames(image_path, 'png')`` and
       call ``png_easyocr_convert_word`` on each.
    3. De-duplicate the module-level ``word_list`` and hand the result to
       ``word_convert_xml`` with the output name ``'test.xml'``.

    ``pdf_path`` and ``image_path`` are not defined in this file; they are
    presumably provided by the star import from ``setting`` (TODO confirm).
    """
    print("................PDF To PNG................")
    pdf_files = reload_fileNames(pdf_path, 'pdf')

    # Track how many PDFs were handed to the converter; the running index is
    # also passed to pdf_convert_png as its fourth argument.
    pdf_count = 0
    for index, pdf_name in enumerate(pdf_files):
        pdf_convert_png(pdf_path, pdf_name, image_path, index)
        pdf_count = index + 1
    print("[.]抓取pdf文件个数:{}".format(pdf_count))

    print("................PNG to Word................")
    png_files = reload_fileNames(image_path, 'png')
    for png_file_name in png_files:
        # NOTE(review): word_list is presumably filled in place by this call;
        # confirm against png_convert_word.
        png_easyocr_convert_word(png_file_name, image_path)
    print("识别单词个数:", len(word_list))

    # De-duplicate while keeping first-seen order. Going through set() made
    # the order depend on per-process string hashing, so the words reached
    # word_convert_xml in a different order on every run.
    unique_words = list(dict.fromkeys(word_list))
    print("去重复后", len(unique_words))

    word_convert_xml(unique_words, 'test.xml')
    print("总单词个数:", len(unique_words))
 
 
 
# Start the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()